Data preparation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop identifier and free-text columns that carry no predictive signal
data <- data %>% select(-id, -host_name, -last_review, -name)
colSums(is.na(data))
## host_id neighbourhood_group
## 0 0
## neighbourhood latitude
## 0 0
## longitude room_type
## 0 0
## price minimum_nights
## 0 0
## number_of_reviews reviews_per_month
## 0 10052
## calculated_host_listings_count availability_365
## 0 0
# Impute missing reviews_per_month with the column mean
data$reviews_per_month[is.na(data$reviews_per_month)] <- mean(data$reviews_per_month, na.rm = TRUE)
colSums(is.na(data))
## host_id neighbourhood_group
## 0 0
## neighbourhood latitude
## 0 0
## longitude room_type
## 0 0
## price minimum_nights
## 0 0
## number_of_reviews reviews_per_month
## 0 0
## calculated_host_listings_count availability_365
## 0 0
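Mean imputation is the simplest fix, but in this dataset the missing reviews_per_month values usually belong to listings with no reviews at all, in which case imputing 0 is arguably more faithful. A sketch of that alternative (it would have to run before the imputation above, hence shown commented out):
# Check whether the NAs coincide with zero-review listings ...
# all(data$number_of_reviews[is.na(data$reviews_per_month)] == 0)
# ... and if so, impute 0 instead of the mean
# data$reviews_per_month[is.na(data$reviews_per_month)] <- 0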
library(corrplot)
## corrplot 0.95 loaded
# Correlation heat map of the numeric predictors
num_data <- data %>% select_if(is.numeric)
cor_matrix <- cor(num_data)
corrplot(cor_matrix, method = "color", tl.col = "black")
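The heat map gives a quick visual read; to rank the relationships numerically, the matrix can be flattened and sorted. A minimal sketch:
# List each variable pair once, ordered by correlation magnitude
cor_long <- as.data.frame(as.table(cor_matrix))
names(cor_long) <- c("var1", "var2", "correlation")
cor_long <- cor_long[as.character(cor_long$var1) < as.character(cor_long$var2), ]
head(cor_long[order(-abs(cor_long$correlation)), ])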

library(ggplot2)
ggplot(data, aes(x = price)) +
  geom_histogram(bins = 30, fill = "blue") +
  ggtitle("The raw distribution of prices") +
  labs(x = "Price")

# Log-transform price (the +1 guards against zero-priced listings) and drop the raw column
data$log_price <- log(data$price + 1)
data <- data %>%
  select(-price)
ggplot(data, aes(x = log_price)) +
  geom_histogram(bins = 30, fill = "blue") +
  ggtitle("Distribution of log-transformed prices") +
  labs(x = "Log price")

library(dplyr)
library(leaflet)
# Select relevant columns, including the pre-computed log_price
to_leaflet_pre <- data %>%
  select(longitude, latitude, neighbourhood_group, neighbourhood, room_type, log_price)
# Cap log_price to the range [3, 7] for visualization purposes
to_leaflet_pre$log_price <- pmax(3, pmin(7, to_leaflet_pre$log_price))
# Center of map
center_lon <- median(to_leaflet_pre$longitude, na.rm = TRUE)
center_lat <- median(to_leaflet_pre$latitude, na.rm = TRUE)
# Color palette for log_price; a fixed ramp of 100 colors suffices,
# since colorNumeric() interpolates over the domain anyway
pal <- colorNumeric(
  palette = colorRampPalette(c("red", "yellow", "green"))(100),
  domain = to_leaflet_pre$log_price
)
# Create interactive map
leaflet(data = to_leaflet_pre) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    popup = ~paste0(
      "<strong>Region:</strong> ", neighbourhood_group, ", ", neighbourhood, "<br>",
      "<strong>Room type:</strong> ", room_type, "<br>",
      "<strong>Log price:</strong> ", round(log_price, 2), "<br>"
    ),
    color = ~pal(log_price),
    radius = 5,
    fillOpacity = 0.7,
    stroke = FALSE,
    clusterOptions = markerClusterOptions()
  ) %>%
  setView(lng = center_lon, lat = center_lat, zoom = 10) %>%
  addLegend(
    position = "topleft",
    pal = pal,
    values = ~log_price,
    title = "Log Prices of Airbnb Listings",
    opacity = 1
  )
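The widget only lives inside the knitted report; to share it on its own it can be exported with htmlwidgets. A minimal sketch, assuming the pipeline above is first stored in an object (hypothetical name price_map) instead of being printed:
library(htmlwidgets)
# price_map <- leaflet(data = to_leaflet_pre) %>% ...  (the pipeline above)
# saveWidget(price_map, "airbnb_log_price_map.html", selfcontained = TRUE)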
Model Discussion
GAMs
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(caret)
## Loading required package: lattice
set.seed(123)
train_index <- createDataPartition(data$log_price, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
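createDataPartition stratifies on the outcome, so the two splits should show near-identical log-price distributions; a quick check:
# Compare the outcome distribution across the stratified split
summary(train_data$log_price)
summary(test_data$log_price)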
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
## The following objects are masked from 'package:gam':
##
## gam, gam.control, gam.fit, s
library(rsample)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
## The following objects are masked from 'package:foreach':
##
## accumulate, when
set.seed(123)
folds <- vfold_cv(data, v = 5)
# Fit the GAM on each analysis set, score it on the assessment set,
# and collect the fold-level metrics into one data frame
cv_results <- map_dfr(folds$splits, function(split) {
  train_split <- analysis(split)
  val_split <- assessment(split)
  gam_model <- gam(  # mgcv::gam, which masks gam::gam
    log_price ~ s(latitude) + s(longitude) + s(minimum_nights) +
      s(number_of_reviews) + s(reviews_per_month) +
      s(calculated_host_listings_count) + s(availability_365) +
      neighbourhood_group + room_type,
    data = train_split
  )
  preds <- predict(gam_model, newdata = val_split)
  data.frame(
    AIC = AIC(gam_model),
    BIC = BIC(gam_model),
    Adj_R2 = summary(gam_model)$r.sq,
    RMSE = rmse(val_split$log_price, preds)
  )
})
cat("Average AIC:", mean(cv_results$AIC), "\n")
## Average AIC: 51093.13
cat("Average BIC:", mean(cv_results$BIC), "\n")
## Average BIC: 51637.82
cat("Average Adjusted R²:", mean(cv_results$Adj_R2), "\n")
## Average Adjusted R²: 0.55362
cat("Average RMSE:", mean(cv_results$RMSE), "\n")
## Average RMSE: 0.4650892
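The cross-validated metrics summarize overall fit but not where the nonlinearity lives. A sketch that refits once on the training split and draws the estimated smooths with mgcv's plot.gam:
# Refit on the training split and plot all smooth terms on one page
final_gam <- gam(
  log_price ~ s(latitude) + s(longitude) + s(minimum_nights) +
    s(number_of_reviews) + s(reviews_per_month) +
    s(calculated_host_listings_count) + s(availability_365) +
    neighbourhood_group + room_type,
  data = train_data
)
plot(final_gam, pages = 1, shade = TRUE)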
Boosting
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(123)
control <- trainControl(method = "cv", number = 5)
gbm_model <- train(
  log_price ~ latitude + longitude + minimum_nights + number_of_reviews +
    reviews_per_month + calculated_host_listings_count + availability_365 +
    neighbourhood_group + room_type,
  data = train_data,
  method = "gbm",
  trControl = control,
  tuneLength = 5,
  verbose = FALSE
)
print(gbm_model)
## Stochastic Gradient Boosting
##
## 39117 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 31295, 31295, 31294, 31292, 31292
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 0.5057970 0.4913537 0.3636289
## 1 100 0.4833654 0.5283927 0.3449918
## 1 150 0.4738119 0.5416858 0.3375333
## 1 200 0.4690479 0.5485371 0.3341971
## 1 250 0.4666565 0.5517995 0.3325737
## 2 50 0.4801048 0.5351175 0.3425768
## 2 100 0.4643370 0.5582257 0.3307043
## 2 150 0.4592904 0.5656327 0.3272888
## 2 200 0.4569101 0.5695199 0.3258748
## 2 250 0.4554581 0.5720984 0.3247969
## 3 50 0.4707650 0.5497830 0.3354047
## 3 100 0.4579274 0.5684302 0.3262670
## 3 150 0.4544735 0.5740498 0.3238697
## 3 200 0.4523963 0.5777319 0.3224183
## 3 250 0.4512094 0.5798783 0.3214524
## 4 50 0.4657537 0.5576253 0.3317610
## 4 100 0.4549156 0.5737034 0.3242083
## 4 150 0.4517824 0.5790069 0.3218254
## 4 200 0.4501156 0.5819272 0.3205804
## 4 250 0.4489459 0.5840324 0.3197992
## 5 50 0.4620750 0.5632416 0.3291552
## 5 100 0.4524728 0.5779501 0.3224659
## 5 150 0.4497843 0.5826146 0.3202886
## 5 200 0.4480907 0.5856444 0.3189426
## 5 250 0.4467777 0.5880243 0.3179834
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
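tuneLength = 5 only walks caret's default grid, which holds shrinkage and n.minobsinnode fixed. An explicit tuneGrid gives finer control; a sketch with hypothetical values (not run, since it multiplies the fitting time):
# Hypothetical explicit grid; passed via tuneGrid in place of tuneLength
gbm_grid <- expand.grid(
  n.trees = c(250, 500, 1000),
  interaction.depth = c(3, 5, 7),
  shrinkage = c(0.05, 0.1),
  n.minobsinnode = c(10, 20)
)
# train(..., method = "gbm", trControl = control, tuneGrid = gbm_grid, verbose = FALSE)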
summary(gbm_model)

## var rel.inf
## room_typePrivate room room_typePrivate room 48.97834882
## longitude longitude 13.03437708
## room_typeShared room room_typeShared room 10.49956878
## latitude latitude 8.73159804
## availability_365 availability_365 5.84517929
## minimum_nights minimum_nights 3.94520960
## neighbourhood_groupManhattan neighbourhood_groupManhattan 3.35662901
## number_of_reviews number_of_reviews 2.31646769
## calculated_host_listings_count calculated_host_listings_count 2.27427586
## reviews_per_month reviews_per_month 0.87305383
## neighbourhood_groupBrooklyn neighbourhood_groupBrooklyn 0.11792785
## neighbourhood_groupQueens neighbourhood_groupQueens 0.02736416
## neighbourhood_groupStaten Island neighbourhood_groupStaten Island 0.00000000
# Partial dependence of log price on the two spatial coordinates, side by side
par(mfrow = c(1, 2))
plot(gbm_model$finalModel, i = "longitude", main = "Partial Dependence: longitude")
plot(gbm_model$finalModel, i = "latitude", main = "Partial Dependence: latitude")
par(mfrow = c(1, 1))

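The two marginal effects can also be combined into a joint surface over both coordinates, which reads as a coarse price map of the city. A sketch using plot.gbm with two variables (rendered as a level plot):
# Joint partial dependence over the two spatial coordinates
plot(gbm_model$finalModel, i.var = c("longitude", "latitude"),
  main = "Partial Dependence: longitude x latitude")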
train_preds <- predict(gbm_model, newdata = train_data)
test_preds <- predict(gbm_model, newdata = test_data)
# RMSE on the log scale
rmse_train <- sqrt(mean((train_data$log_price - train_preds)^2))
rmse_test <- sqrt(mean((test_data$log_price - test_preds)^2))
# R²
r2_train <- 1 - sum((train_data$log_price - train_preds)^2) /
  sum((train_data$log_price - mean(train_data$log_price))^2)
r2_test <- 1 - sum((test_data$log_price - test_preds)^2) /
  sum((test_data$log_price - mean(test_data$log_price))^2)
# Number of predictors (excluding intercept)
p <- length(gbm_model$finalModel$var.names)
# Sample sizes
n_train <- nrow(train_data)
n_test <- nrow(test_data)
# Adjusted R²
adj_r2_train <- 1 - ((1 - r2_train) * (n_train - 1)) / (n_train - p - 1)
adj_r2_test <- 1 - ((1 - r2_test) * (n_test - 1)) / (n_test - p - 1)
cat("Train RMSE:", rmse_train, "\n")
## Train RMSE: 0.4356909
cat("Test RMSE:", rmse_test, "\n")
## Test RMSE: 0.4445946
cat("Train Adjusted R²:", adj_r2_train, "\n")
## Train Adjusted R²: 0.6080001
cat("Test Adjusted R²:", adj_r2_test, "\n")
## Test Adjusted R²: 0.5874943
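The RMSE above is on the log(price + 1) scale. For a dollar-scale figure the predictions can be naively back-transformed; a sketch (ignoring retransformation bias):
# Naive back-transform of test predictions to the dollar scale
pred_price <- exp(test_preds) - 1
true_price <- exp(test_data$log_price) - 1
sqrt(mean((true_price - pred_price)^2))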